1 - kmeans

library(tibble)
library(ggpubr)
library(plotly)

load("films_clus.RData")
# km1 <- kmeans(films_clus, centers = 10, nstart = 100)

This command returns an error, since euclidean distance can be computed only on numerical data.

cbind(colnames(films_clus), 1:20)
##       [,1]             [,2]
##  [1,] "score"          "1" 
##  [2,] "director"       "2" 
##  [3,] "year"           "3" 
##  [4,] "country"        "4" 
##  [5,] "imdb_score"     "5" 
##  [6,] "genre_1"        "6" 
##  [7,] "genre_2"        "7" 
##  [8,] "budget_mln"     "8" 
##  [9,] "boxoffice_mln"  "9" 
## [10,] "profit_ratio"   "10"
## [11,] "marcello_score" "11"
## [12,] "d_DiCaprio"     "12"
## [13,] "d_Bale"         "13"
## [14,] "d_Pitt"         "14"
## [15,] "d_Damon"        "15"
## [16,] "cum_actors"     "16"
## [17,] "d_frombook"     "17"
## [18,] "d_truestory"    "18"
## [19,] "d_rewatched"    "19"
## [20,] "where"          "20"
data_num <- films_clus[, c(1, 5, 8, 9, 10, 11, 16)]

Of course we’re losing a lot of information.

km1 <- kmeans(data_num, centers = 8, nstart = 100)
pairs(data_num[1:5], col = km1$cluster, pch = 20)

boxplot(data_num)

We have very different ranges \(\implies\) let’s scale the data:

# standardized data
scdata <- scale(data_num)
km2 <- kmeans(scdata, centers = 8, nstart = 100)
pairs(scdata[,1:5], col = km2$cluster, pch = 20)

pc <- princomp(data_num)
summary(pc)
## Importance of components:
##                             Comp.1       Comp.2       Comp.3       Comp.4
## Standard deviation     353.7789497 35.392991766 15.512142888 9.4669399713
## Proportion of Variance   0.9871038  0.009879465  0.001897765 0.0007068351
## Cumulative Proportion    0.9871038  0.996983297  0.998881062 0.9995878973
##                            Comp.5       Comp.6       Comp.7
## Standard deviation     6.12824016 3.8113323302 4.133517e-01
## Proportion of Variance 0.00029619 0.0001145651 1.347530e-06
## Cumulative Proportion  0.99988409 0.9999986525 1.000000e+00
s_pc <- princomp(scdata)
summary(s_pc)
## Importance of components:
##                          Comp.1   Comp.2   Comp.3    Comp.4     Comp.5
## Standard deviation     1.475324 1.331064 1.119814 0.8438584 0.80742929
## Proportion of Variance 0.314081 0.255661 0.180950 0.1027557 0.09407533
## Cumulative Proportion  0.314081 0.569742 0.750692 0.8534477 0.94752300
##                            Comp.6     Comp.7
## Standard deviation     0.46204299 0.38753306
## Proportion of Variance 0.03080573 0.02167127
## Cumulative Proportion  0.97832873 1.00000000

The first two principal components are not enough informative.
Let’s evaluate the results on the first two anyways:

tbb <- tibble(
  "Dim.1" = pc$scores[, 1],
  "Dim.2" = pc$scores[, 2],
  "s_Dim.1" = s_pc$scores[, 1],
  "s_Dim.2" = s_pc$scores[, 2],
  "groups" = as.factor(km1$cluster),
  "s_groups" = as.factor(km2$cluster)
)

ggscatter(tbb,
          x = "Dim.1", y = "Dim.2",
          label = NULL,
          color = "groups",
          palette = "aaas",
          xlim= c(-750, 300),
          size = 0.8,
          ellipse = TRUE,
          ellipse.type = "convex",
          main = "Unscaled data",
          subtitle = "Problem: too much different ranges"
)

ggscatter(tbb,
          x = "s_Dim.1", y = "s_Dim.2",
          label = NULL,
          color = "s_groups",
          palette = "jco",
          ylim = c(-3.5, 1.5),
          size = 0.8,
          ellipse = TRUE,
          ellipse.type = "convex",
          main = "Scaled data",
          subtitle = "Problem: first two princomp not enough informative"
)

And now let’s plot the clustering on the first three principal components:

plot_ly(
  x = s_pc$scores[, 1],
  y = s_pc$scores[, 2],
  z = s_pc$scores[, 3],
  type = "scatter3d",
  mode = "markers",
  color = as.factor(km2$cluster)
)

Note that the previous plot is interactive, you can move the axes as you want.